{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Clustering text documents using k-means\n\n\nThis is an example showing how scikit-learn can be used to cluster\ndocuments by topics using a bag-of-words approach. This example uses\na scipy.sparse matrix to store the features instead of standard numpy arrays.\n\nTwo feature extraction methods can be used in this example:\n\n  - TfidfVectorizer uses an in-memory vocabulary (a Python dict) to map the most\n    frequent words to feature indices and hence compute a word occurrence\n    frequency (sparse) matrix. The word frequencies are then reweighted using\n    the Inverse Document Frequency (IDF) vector collected feature-wise over\n    the corpus.\n\n  - HashingVectorizer hashes word occurrences to a fixed dimensional space,\n    possibly with collisions. The word count vectors are then normalized to\n    each have l2-norm equal to one (projected onto the Euclidean unit sphere),\n    which seems to be important for k-means to work in high-dimensional space.\n\n    HashingVectorizer does not provide IDF weighting as this is a stateless\n    model (the fit method does nothing). When IDF weighting is needed it can\n    be added by pipelining its output to a TfidfTransformer instance.\n\nTwo algorithms are demoed: ordinary k-means and its more scalable cousin\nminibatch k-means.\n\nAdditionally, latent semantic analysis can also be used to reduce\ndimensionality and discover latent patterns in the data.\n\nIt can be noted that k-means (and minibatch k-means) are very sensitive to\nfeature scaling and that in this case the IDF weighting helps improve the\nquality of the clustering by quite a lot as measured against the \"ground truth\"\nprovided by the class label assignments of the 20 newsgroups dataset.\n\nThis improvement is not visible in the Silhouette Coefficient, which is small\nfor both, as this measure seems to suffer from the phenomenon called\n\"Concentration of Measure\" or \"Curse of Dimensionality\" for high-dimensional\ndatasets such as text data. Other measures such as V-measure and Adjusted Rand\nIndex are information-theoretic evaluation scores: since they are based only\non cluster assignments rather than distances, they are not affected by the\ncurse of dimensionality.\n\nNote: as k-means is optimizing a non-convex objective function, it will likely\nend up in a local optimum. Several runs with independent random initializations\nmight be necessary to get good convergence.\n\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>\n#         Lars Buitinck\n# License: BSD 3 clause\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.feature_extraction.text import HashingVectorizer\nfrom sklearn.feature_extraction.text import TfidfTransformer\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import Normalizer\nfrom sklearn import metrics\n\nfrom sklearn.cluster import KMeans, MiniBatchKMeans\n\nimport logging\nfrom optparse import OptionParser\nimport sys\nfrom time import time\n\nimport numpy as np\n\n\n# Seed shared by every stochastic step (dataset shuffling, SVD, k-means\n# initialization, silhouette sub-sampling) so that re-running the notebook\n# reproduces the same scores.\nRANDOM_STATE = 42\n\n# Display progress logs (logging.basicConfig writes to stderr by default)\nlogging.basicConfig(level=logging.INFO,\n                    format='%(asctime)s %(levelname)s %(message)s')\n\n# parse commandline arguments\nop = OptionParser()\nop.add_option(\"--lsa\",\n              dest=\"n_components\", type=\"int\",\n              help=\"Preprocess documents with latent semantic analysis.\")\nop.add_option(\"--no-minibatch\",\n              action=\"store_false\", dest=\"minibatch\", default=True,\n              help=\"Use ordinary k-means algorithm (in batch mode).\")\nop.add_option(\"--no-idf\",\n              action=\"store_false\", dest=\"use_idf\", default=True,\n              help=\"Disable Inverse Document Frequency feature weighting.\")\nop.add_option(\"--use-hashing\",\n              action=\"store_true\", default=False,\n              help=\"Use a hashing feature vectorizer\")\nop.add_option(\"--n-features\", type=int, default=10000,\n              help=\"Maximum number of features (dimensions)\"\n                   \" to extract from text.\")\nop.add_option(\"--verbose\",\n              action=\"store_true\", dest=\"verbose\", default=False,\n              help=\"Print progress reports inside k-means algorithm.\")\n\nprint(__doc__)\nop.print_help()\n\n\ndef is_interactive():\n    \"\"\"Return True when ``__main__`` has no ``__file__`` attribute.\n\n    Used below to detect a Jupyter notebook or IPython console, where\n    ``sys.argv`` must not be parsed as this script's options.\n    \"\"\"\n    return not hasattr(sys.modules['__main__'], '__file__')\n\n\n# work-around for Jupyter notebook and IPython console\nargv = [] if is_interactive() else sys.argv[1:]\n(opts, args) = op.parse_args(argv)\nif len(args) > 0:\n    # OptionParser.error() prints the usage message and exits with status 2,\n    # so no explicit sys.exit() is needed after it.\n    op.error(\"this script takes no arguments.\")\n\n\n# #############################################################################\n# Load some categories from the training set\ncategories = [\n    'alt.atheism',\n    'talk.religion.misc',\n    'comp.graphics',\n    'sci.space',\n]\n# Uncomment the following to do the analysis on all the categories\n# categories = None\n\nprint(\"Loading 20 newsgroups dataset for categories:\")\nprint(categories)\n\ndataset = fetch_20newsgroups(subset='all', categories=categories,\n                             shuffle=True, random_state=RANDOM_STATE)\n\nprint(\"%d documents\" % len(dataset.data))\nprint(\"%d categories\" % len(dataset.target_names))\nprint()\n\nlabels = dataset.target\ntrue_k = np.unique(labels).shape[0]\n\nprint(\"Extracting features from the training dataset \"\n      \"using a sparse vectorizer\")\nt0 = time()\nif opts.use_hashing:\n    if opts.use_idf:\n        # Perform an IDF normalization on the output of HashingVectorizer\n        hasher = HashingVectorizer(n_features=opts.n_features,\n                                   stop_words='english', alternate_sign=False,\n                                   norm=None)\n        vectorizer = make_pipeline(hasher, TfidfTransformer())\n    else:\n        vectorizer = HashingVectorizer(n_features=opts.n_features,\n                                       stop_words='english',\n                                       alternate_sign=False, norm='l2')\nelse:\n    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,\n                                 min_df=2, stop_words='english',\n                                 use_idf=opts.use_idf)\nX = vectorizer.fit_transform(dataset.data)\n\nprint(\"done in %fs\" % (time() - t0))\nprint(\"n_samples: %d, n_features: %d\" % X.shape)\nprint()\n\nif opts.n_components:\n    print(\"Performing dimensionality reduction using LSA\")\n    t0 = time()\n    # Vectorizer results are normalized, which makes KMeans behave as\n    # spherical k-means for better results. Since LSA/SVD results are\n    # not normalized, we have to redo the normalization.\n    svd = TruncatedSVD(opts.n_components, random_state=RANDOM_STATE)\n    normalizer = Normalizer(copy=False)\n    lsa = make_pipeline(svd, normalizer)\n\n    X = lsa.fit_transform(X)\n\n    print(\"done in %fs\" % (time() - t0))\n\n    explained_variance = svd.explained_variance_ratio_.sum()\n    print(\"Explained variance of the SVD step: {}%\".format(\n        int(explained_variance * 100)))\n\n    print()\n\n\n# #############################################################################\n# Do the actual clustering\n\nif opts.minibatch:\n    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,\n                         init_size=1000, batch_size=1000,\n                         verbose=opts.verbose, random_state=RANDOM_STATE)\nelse:\n    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,\n                verbose=opts.verbose, random_state=RANDOM_STATE)\n\nprint(\"Clustering sparse data with %s\" % km)\nt0 = time()\nkm.fit(X)\nprint(\"done in %0.3fs\" % (time() - t0))\nprint()\n\nprint(\"Homogeneity: %0.3f\" % metrics.homogeneity_score(labels, km.labels_))\nprint(\"Completeness: %0.3f\" % metrics.completeness_score(labels, km.labels_))\nprint(\"V-measure: %0.3f\" % metrics.v_measure_score(labels, km.labels_))\nprint(\"Adjusted Rand-Index: %.3f\"\n      % metrics.adjusted_rand_score(labels, km.labels_))\n# The silhouette is estimated on a random sub-sample, hence the seed.\nprint(\"Silhouette Coefficient: %0.3f\"\n      % metrics.silhouette_score(X, km.labels_, sample_size=1000,\n                                 random_state=RANDOM_STATE))\n\nprint()\n\n\nif not opts.use_hashing:\n    print(\"Top terms per cluster:\")\n\n    if opts.n_components:\n        original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n        order_centroids = original_space_centroids.argsort()[:, ::-1]\n    else:\n        order_centroids = km.cluster_centers_.argsort()[:, ::-1]\n\n    # get_feature_names() was removed in scikit-learn 1.2 in favour of\n    # get_feature_names_out(); use whichever the installed version offers.\n    if hasattr(vectorizer, 'get_feature_names_out'):\n        terms = vectorizer.get_feature_names_out()\n    else:\n        terms = vectorizer.get_feature_names()\n    for i in range(true_k):\n        print(\"Cluster %d:\" % i, end='')\n        for ind in order_centroids[i, :10]:\n            print(' %s' % terms[ind], end='')\n        print()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}